import sqlite3
import requests
import re


# HTTP request headers sent with every page fetch (passed to requests.get()
# in main()).
# NOTE(review): the cookie value looks like a captured logged-in browser
# session (it carries _xsrf / SESSIONID style tokens) — presumably it expires,
# and hard-coding it commits credentials to the source file. Consider loading
# it from configuration instead. TODO confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                  '/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'http://www.zhihu.com/articles',
    # One long cookie string, split across adjacent literals only for line
    # length; Python concatenates them into a single value.
    'cookie': 'zap=eb7ed6f9-455d-44ca-9e86-f72ef90ad1b3; _xsrf=9OTADnJ91McFo8gJ9kKgXIXDU80S'
              'moF1; d_c0="AKBekymWSBGPTjaZS0YwagqhSYviAkv6rgM=|1589708978"; capsion_ticket'
              '="2|1:0|10:1589708978|14:capsion_ticket|44:NzkzZTFlNmJjYWVlNDUwNjgxZGVmNDE2M'
              'mY4MjcwNmI=|8ce799b93a24f2ee9269f1a527c2855d1e06266b0e8c11b977a05dd51f812310'
              '"; _ga=GA1.2.1778484913.1589708978; r_cap_id="ZGU0NTZhN2IxN2Y3NDRlYjkyM2IwMT'
              'I1OGNmNzM4ZTc=|1589708996|14bb56425c65d9deaa97ace4c85ec8e714ac7df9"; cap_id='
              '"ZTU0M2NkNzRmNjIxNDE0NmFjOTBiNmYyYjY1MmRjNGU=|1589708996|a27d273bac48a7e04ae'
              'a0075769d81b6d71e4762"; l_cap_id="YjM3NzM2YzY1NzIwNDg0ZjhkZTk3ZWZhOTdlMjE4NW'
              'E=|1589708996|43a468ef2f8141b41a76e782f34c5f679af7f2de"; z_c0=Mi4xMWtkV0d3QU'
              'FBQUFBb0Y2VEtaWklFUmNBQUFCaEFsVk55RmF1WHdERzNuQ1h1V1praWEwRzhsZ3pPdHhlMVZDUj'
              'ZB|1589709000|3e1420f541cb541c1f506e777f797daf20658ad5; tst=r; _gid=GA1.2.17'
              '90679598.1589972707; q_c1=37a7aebe36fe488092d67690c701cac1|1589977027000|158'
              '9977027000; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1589977027,1589977212,15'
              '89977287,1589978422; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1589978422; _g'
              'at_gtag_UA_149949619_1=1; SESSIONID=kqwdAfDw2zVqsK47ggfBrdmSNhoS47m5WNmwMKSV'
              'xiV; JOID=V1sUAU794xj2k_e4RPoFiSuVavJRkJBsldKX_jDPn0yE57yKcc_H2a-V87lA9bN1Nc'
              'G_r0L3a_4uGTnTGmTIL28=; osd=VVgRAEj_4B33lfW7QfsDiyiQa_RTk5Vtk9CU-zHJnU-B5rqI'
              'csrG362W9rhG97BwNMe9rEf2bfwtHDjVGGfNLmk=; KLBRSID=81978cf28cf03c58e07f705c15'
              '6aa833|1589978429|1589978420'

}


def get_data(param):
    """Save every item of one feed page, then continue with the next page.

    Each item's text is reduced to Chinese characters and common Chinese
    punctuation, paired with the author's name and follower count, and
    handed to ``save_data``. Afterwards the next page is fetched through
    ``main`` unless this page is marked as the last one or carries no
    next URL.

    Args:
        param: Decoded JSON response (a dict). ``param['paging']`` is read
            for ``is_end`` and ``next``; ``param['data']`` is read as a list
            of items, each with a ``target`` dict holding ``author`` and
            ``content``.

    Raises:
        KeyError: If ``paging`` is missing, or an item lacks ``target``,
            ``author``, ``content`` or the author's ``name``.
    """
    page_data = param['paging']
    next_url = page_data.get('next')  # URL of the next page to crawl

    # Characters to keep: CJK ideographs plus common Chinese punctuation.
    # Compiled once per page instead of once per item.
    keep_chars = re.compile(
        '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]')

    # Walk over whatever the page actually returned rather than a fixed
    # count of 5, which raised IndexError on short pages and dropped any
    # item past the fifth.
    for item in param.get('data') or []:
        target_data = item['target']
        author_data = target_data['author']

        content = ''.join(keep_chars.findall(target_data['content']))  # article text

        # Follower count: prefer the item's author, fall back to the author
        # of the related question, and finally to 0. Only lookup failures
        # are handled so unrelated errors are not hidden.
        try:
            people = author_data['followers_count']
        except (KeyError, TypeError):
            try:
                people = target_data['question']['author']['followers_count']
            except (KeyError, TypeError):
                people = 0

        author_name = author_data['name']  # author name
        save_data(content=content, people=people, name=author_name)

    # Stop on the last page. A missing next URL also stops the crawl:
    # passing None to main() would restart from the default start URL.
    if page_data.get('is_end') or not next_url:
        return

    # Exactly one follow-up request per page. The previous `while True`
    # wrapper re-processed the same page forever once the nested call
    # returned. Note that main() calls back into this function, so the call
    # stack still grows with every page crawled.
    main(url=next_url)


def save_data(content, people, name):
    """Insert one article record into the ``zhihu`` table of ``data.db``.

    Args:
        content: Article text, stored in the ``text`` column.
        people: Follower count, stored in the ``people_nums`` column.
        name: Author name, stored in the ``name`` column.

    Raises:
        sqlite3.Error: If the insert fails (for example when the ``zhihu``
            table does not exist). The connection is closed either way.
    """
    conn = sqlite3.connect('data.db')
    try:
        # Bound parameters instead of string interpolation: a quote inside
        # the name or content used to break the statement and allowed SQL
        # injection from scraped text.
        # `people` is bound as text because the previous statement embedded
        # it as a quoted literal; this keeps stored values unchanged
        # whatever the column's declared type is.
        conn.execute(
            "insert into zhihu(name, people_nums, text) values (?, ?, ?)",
            (name, str(people), content),
        )
        conn.commit()
    finally:
        # Release the database handle even when the insert raises.
        conn.close()
    print('%s 的文章 保存成功' % name)


def main(url, timeout=10):
    """Fetch one feed page and pass its JSON payload to ``get_data``.

    Args:
        url: Page URL to request; ``None`` selects the built-in start URL.
        timeout: Seconds to wait for the server before ``requests`` gives
            up and raises a timeout error.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    if url is None:
        url = 'https://www.zhihu.com/api/v3/feed/topstory/recommend?session_token=b057e92070219b3f3260' \
              'aac4940b7f3f&desktop=true&page_number=4&limit=6&action=pull&ad_interval=-1&before_id=17'

    # Without a timeout, requests waits indefinitely on a stalled
    # connection and the whole crawl hangs.
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 200:
        get_data(response.json())
    else:
        # Any non-200 status ends this branch of the crawl with a message.
        print('访问网页错误')


# Script entry point: passing None makes main() start from its built-in URL.
if __name__ == '__main__':
    main(None)


